! pip install plotly
! pip install networkx
! pip install dash
import pandas as pd
import plotly.graph_objects as go
import networkx as nx
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
plt.rcParams["figure.figsize"] = (20,10)
import tqdm
# `IPython.core.display` is deprecated; `IPython.display` is the public API.
from IPython.display import display, HTML
# Widen the notebook container for the large network plots below.
display(HTML("<style>.container { width:80% !important; }</style>"))










# Load the per-book character co-occurrence edge lists and aggregate the
# weights across all five books.
books = []
for i in range(5):
    books.append(pd.read_csv('book{}-edges.csv'.format(i + 1)))
all_books = pd.concat(books)
all_books.head()
edges = all_books.groupby(['Source', 'Target']).agg({'weight': 'sum'}).reset_index()
edges.sort_values('weight', ascending=False).head()
GOT = nx.from_pandas_edgelist(edges,
                              source='Source',
                              target='Target',
                              edge_attr='weight')
# nx.info() was removed in NetworkX 3.0; printing the graph gives the summary.
print(GOT)
# Weighted degree = sum of co-occurrence weights on each node's edges.
weighted_degrees = dict(nx.degree(GOT, weight='weight'))
max_degree = max(weighted_degrees.values())
import seaborn as sns
h = plt.hist(weighted_degrees.values(), bins=30)
# Keep only the well-connected characters (weighted degree > 200).
subG = GOT.subgraph([n for n in weighted_degrees if weighted_degrees[n] > 200])
# nx.info() was removed in NetworkX 3.0; printing the graph gives the summary.
print(subG)
pos = nx.spring_layout(subG, weight='weight', iterations=20, k=4)
plt.axis('off')
plt.title('Game of Thrones Network', fontsize=24)
for node in subG.nodes():
    # Node area scales with weighted degree (sqrt keeps big hubs readable).
    size = 100 * weighted_degrees[node] ** 0.5
    ns = nx.draw_networkx_nodes(subG, pos, nodelist=[node], node_size=size, node_color='#009fe3')
    ns.set_edgecolor('#f2f6fa')
nx.draw_networkx_labels(subG, pos, {n: n.replace('-', '\n') for n in subG.nodes() if weighted_degrees[n] > 100}, font_size=10)
for e in subG.edges(data=True):
    # Only draw the strong co-occurrence edges to reduce clutter.
    if e[2]['weight'] > 10:
        nx.draw_networkx_edges(subG, pos, [e], width=e[2]['weight'] / 100, edge_color='#707070')
infection_times = {}


def independent_cascade(G, t, infection_times):
    """Advance an independent-cascade simulation one step (t -> t+1).

    Every node that became infected at time t attempts to infect each of
    its not-yet-infected neighbours; an attempt succeeds with probability
    proportional to the edge weight (edge weight / heaviest edge weight).

    Mutates and returns ``infection_times`` (node -> infection time).
    """
    heaviest = max(data['weight'] for _, _, data in G.edges(data=True))
    spreaders = [node for node, when in infection_times.items() if when == t]
    for spreader in spreaders:
        for neighbour in G.neighbors(spreader):
            if neighbour in infection_times:
                continue  # already infected at some earlier step
            edge_weight = G.get_edge_data(spreader, neighbour)['weight']
            if edge_weight >= np.random.random() * heaviest:
                infection_times[neighbour] = t + 1
    return infection_times
def plot_G(G, pos, infection_times, t):
    """Draw the network at step t.

    Colours: yellow = infected exactly at t, red = infected earlier,
    blue = not (yet) infected.  Node sizes come from the module-level
    ``weighted_degrees`` dict.
    """
    newly_infected = [node for node, when in infection_times.items() if when == t]
    plt.figure()
    plt.axis('off')
    plt.title('Game of Thrones Network, t={}'.format(t), fontsize=24)
    for node in G.nodes():
        node_size = 100 * weighted_degrees[node] ** 0.5
        if node in newly_infected:
            colour = '#feba02'
        elif infection_times.get(node, 9999999) < t:
            colour = '#ff0000'
        else:
            colour = '#009fe3'
        drawn = nx.draw_networkx_nodes(G, pos, nodelist=[node], node_size=node_size, node_color=colour)
        drawn.set_edgecolor('#f2f6fa')
    nx.draw_networkx_labels(G, pos, {n: n.replace('-', '\n') for n in G.nodes() if weighted_degrees[n] > 100}, font_size=10)
    for edge in G.edges(data=True):
        if edge[2]['weight'] > 10:
            nx.draw_networkx_edges(G, pos, [edge], width=edge[2]['weight'] / 100, edge_color='#707070')
# Seed the cascade: Bran and Sam are infected before the clock starts,
# Jon Snow at t=0; then simulate and plot ten steps.
infection_times = {'Bran-Stark': -1, 'Samwell-Tarly': -1, 'Jon-Snow': 0}
for step in range(10):
    plot_G(subG, pos, infection_times, step)
    infection_times = independent_cascade(subG, step, infection_times)
# Rank the subgraph's characters by four centrality measures.
top = pd.DataFrame.from_dict(dict(nx.degree(subG)), orient='index').sort_values(0, ascending=False)
top.columns = ['Degree']
top['Weighted Degree'] = pd.DataFrame.from_dict(dict(nx.degree(subG, weight='weight')), orient='index')
# nx.pagerank_numpy was removed in NetworkX 3.0; nx.pagerank is the supported API
# (it already returns a dict, so no dict() wrapper is needed).
top['PageRank'] = pd.DataFrame.from_dict(nx.pagerank(subG, weight='weight'), orient='index')
top['Betweenness'] = pd.DataFrame.from_dict(nx.betweenness_centrality(subG, weight='weight'), orient='index')
top.head()
methods = top.columns
# nx.info() was removed in NetworkX 3.0; printing the graph gives the summary.
print(subG)
print(GOT)
max_budget = len(subG.nodes())
trials = 50
# For every seeding budget, seed the top-`budget` nodes of each ranking
# method and measure the mean final infection rate over `trials` runs.
all_results = []
for budget in tqdm.tqdm_notebook(range(max_budget)):
    results = {'budget': budget}
    for method in methods:
        infections = []
        for i in range(trials):
            infected = 0
            t = 0
            infection_times = {n: 0 for n in top.sort_values(method, ascending=False).index[:budget]}
            # Run the cascade until it stops growing.
            while len(infection_times) > infected:
                infected = len(infection_times)
                infection_times = independent_cascade(subG, t, infection_times)
                t += 1
            infections.append(infected)
        results[method] = np.round(np.mean(infections) / len(subG.nodes()), 2)
    all_results.append(results)
# BUG FIX: res_df was used below but never built from the collected results.
res_df = pd.DataFrame(all_results).set_index('budget')
res_df.index = res_df.index / len(subG.nodes())
res_df.head()
res_df.plot()
plt.legend(fontsize=18)
plt.ylabel('Virality rate (out of total graph size)', fontsize=18)
plt.xlabel('Seeding Budget (out of graph size)', fontsize=18)
from itertools import combinations
budget = 2
# BUG FIX: product(*[nodes]*2) enumerated every ordered pair twice and
# included (x, x) self-pairs that waste one seed; combinations() yields
# each unordered pair of distinct nodes exactly once.
seed_sets = list(combinations(subG.nodes(), budget))
print(len(seed_sets), 'Seeding options')
trials = 20
results = {'budget': budget}
for seed in tqdm.tqdm_notebook(seed_sets[:]):
    infections = []
    for i in range(trials):
        infected = 0
        t = 0
        infection_times = {n: 0 for n in seed}
        # Run the cascade until it stops growing.
        while len(infection_times) > infected:
            infected = len(infection_times)
            infection_times = independent_cascade(subG, t, infection_times)
            t += 1
        infections.append(infected)
    results[seed] = np.round(np.mean(infections) / len(subG.nodes()), 2)
# The original appended the same dict once per seed; one reference suffices.
all_results = [results]
# Best ten seed pairs by mean virality rate.
sorted(results.items(), key=lambda x: x[1], reverse=True)[:10]
book1 = pd.read_csv("book1.csv")
book1


def _weight_column(df):
    """Return the weight column's name, tolerating 'Weight'/'weight'.

    The original code used 'Weight' for book 1 but 'weight' for books 2-5;
    resolving the name case-insensitively works for either spelling.
    """
    return next(col for col in df.columns if col.lower() == 'weight')


book1Graph = nx.Graph()
wcol = _weight_column(book1)
for _, edge in book1.iterrows():
    book1Graph.add_edge(edge['Source'], edge['Target'], weight=edge[wcol])
allBooks = [book1Graph]
bookNames = ['book2.csv', 'book3.csv', 'book4.csv', 'book5.csv']
for bookName in bookNames:
    book = pd.read_csv(bookName)
    GBook = nx.Graph()
    wcol = _weight_column(book)
    for _, edge in book.iterrows():
        GBook.add_edge(edge['Source'], edge['Target'], weight=edge[wcol])
    allBooks.append(GBook)
# Degree centrality of every character, one dict per book.
degOfCentrality = [nx.degree_centrality(book) for book in allBooks]
# Print each book's ten most central characters.
for book_no, centrality in enumerate(degOfCentrality, start=1):
    top_ten = sorted(centrality.items(), key=lambda kv: kv[1], reverse=True)[0:10]
    print("Book: {}".format(book_no))
    print(top_ten)
    print("\n")
According to degree centrality, the most important character in the first book is Eddard Stark but he is not even in the top 10 of the fifth book. The importance of characters changes over the course of five books because, you know, stuff happens... ;)
Let's look at the evolution of degree centrality of a couple of characters like Eddard Stark, Jon Snow, and Tyrion, which showed up in the top 10 of degree centrality in the first book.
%matplotlib inline
# Creating a list of degree centrality of all the books
evol = list(map(nx.degree_centrality, allBooks))
# One row per book, one column per character.
degree_evol_df = pd.DataFrame.from_records(evol)
# Three characters that appeared in book 1's degree-centrality top 10.
degree_evol_df[['Eddard-Stark', 'Tyrion-Lannister', 'Jon-Snow']].plot()
We can see that the importance of Eddard Stark dies off as the book series progresses. With Jon Snow, there is a drop in the fourth book but a sudden rise in the fifth book.
Now let's look at various other measures like betweenness centrality and PageRank to find important characters in our Game of Thrones character co-occurrence network and see if we can uncover some more interesting facts about this network. Let's plot the evolution of betweenness centrality of this network over the five books. We will take the evolution of the top four characters of every book and plot it.
# Creating a list of betweenness centrality of all the books just like we did for degree centrality
evol = [nx.betweenness_centrality(book, weight='weight') for book in allBooks]
# One row per book, one column per character.
betweenness_evol_df = pd.DataFrame.from_records(evol)
# Union of each book's top-4 characters by betweenness centrality.
set_of_char = set()
for book_idx in range(5):
    ranked = betweenness_evol_df.T[book_idx].sort_values(ascending=False)
    set_of_char.update(ranked[0:4].index)
list_of_char = list(set_of_char)
betweenness_evol_df[list_of_char].plot(figsize=(13, 7))
We see a peculiar rise in the importance of Stannis Baratheon over the books. In the fifth book, he is significantly more important than other characters in the network, even though he is the third most important character according to degree centrality.
PageRank was the initial way Google ranked web pages. It evaluates the inlinks and outlinks of webpages in the world wide web, which is, essentially, a directed network. Let's look at the importance of characters in the Game of Thrones network according to PageRank.
# Creating a list of pagerank of all the characters in all the books
evol = list(map(nx.pagerank, allBooks))
# One row per book, one column per character.
pagerank_evol_df = pd.DataFrame.from_records(evol)
# Union of each book's top-4 characters by PageRank.
set_of_char = set()
for book_idx in range(5):
    ranked = pagerank_evol_df.T[book_idx].sort_values(ascending=False)
    set_of_char.update(ranked[0:4].index)
list_of_char = list(set_of_char)
pagerank_evol_df[list_of_char].plot(figsize=(13, 7))
Stannis, Jon Snow, and Daenerys are the most important characters in all the books according to PageRank. Eddard Stark's PageRank follows the same pattern as his degree and betweenness centrality: he is important in the first book but fades into obscurity over the rest of the series.
We have seen three different measures of a node's importance in a network, and each of them tells us something about the characters and their role in the co-occurrence network. Since some names appear near the top under all three measures, there may be a strong correlation between them.
Let's look at the correlation between PageRank, betweenness centrality and degree centrality for all the books using Pearson correlation.
# Creating a list of pagerank, betweenness centrality, degree centrality
# of all the characters in all books.
allMeasures = []
allCorr = []
print("Correlation between PageRank, betweenness centrality and degree centrality")
for i in range(5):
    # Three measures for book i: rows are PageRank, betweenness, degree;
    # columns are characters.
    measures = [nx.pagerank(allBooks[i]),
                nx.betweenness_centrality(allBooks[i], weight='weight'),
                nx.degree_centrality(allBooks[i])]
    allMeasures.append(measures)
    cor = pd.DataFrame.from_records(measures)
    # (the original computed cor.T.corr() here and discarded the result)
    allCorr.append(cor)
for i in range(5):
    # FIX: header string was garbled ("CooreBook").
    print("Book: {}".format(i + 1))
    # Pearson correlation between the three measures for this book.
    print(allCorr[i].T.corr())
    print("\n")
We see a high correlation between these three measures for our character co-occurrence network.
So we've been looking at different ways to find the important characters in the Game of Thrones co-occurrence network. According to degree centrality, Eddard Stark is the most important character initially in the books. But who is/are the most important character(s) in the fifth book according to these three measures?
# Finding the most important character in all the books,
# according to degree centrality, betweenness centrality and pagerank.
for i in range(5):
    # FIX: typos in printed strings ("charactesr", "Centarlity").
    print("Important characters in Book: {}".format(i + 1))
    # idxmax over columns picks the character with the highest value per
    # measure row (rows: PageRank, betweenness, degree centrality).
    p_rank, b_cent, d_cent = allCorr[i].idxmax(axis=1)
    # Printing out the top character according to the three measures
    print("Page Rank: ", p_rank, "\nBetweenness Centrality: ", b_cent, "\nDegree Centrality: ", d_cent)
    print("\n")